library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
card_data <- read_csv("data/CardBase.csv") %>%
clean_names()
## Rows: 500 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Card_Number, Card_Family, Cust_ID
## dbl (1): Credit_Limit
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
customer_data <- read_csv("data/CustomerBase.csv") %>%
clean_names()
## Rows: 5674 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Cust_ID, Customer_Segment, Customer_Vintage_Group
## dbl (1): Age
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
fraud_data <- read_csv("data/FraudBase.csv") %>%
clean_names()
## Rows: 109 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Transaction_ID
## dbl (1): Fraud_Flag
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
transaction_data <- read_csv("data/TransactionBase.csv") %>%
clean_names()
## Rows: 10000 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Transaction_ID, Transaction_Date, Credit_Card_ID, Transaction_Segment
## dbl (1): Transaction_Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
join card details and customer details by card id- so that all record of card details and any matching records in customer details are kept.
customer_data = 5674x4 card_data = 500x4
expected output= (500(plus matches if any)) x (4 + (3 from customer_data))
result shows a 1:1 relationship for customer_id
left_join(card_data, customer_data, by = c("cust_id" = "cust_id"))
Join fraud details with transaction details keep - all rows in both tables
transaction_data = 10,000 x 5 fraud_data = 109 x 2
expected output - (10,000 + additional matches) x 2+4
results- tell you that 1:1 relationship between transaction_id all transactions in the fraud data base are accounted for in the transaction_data
full_join(fraud_data, transaction_data, by = c("transaction_id" = "transaction_id"))
Join card details with transaction details keep - rows from first match the second are returned only return results from 1st table once
card_data = 500x4 transaction_data = 10000 x 5 relationship - card:transaction = 1:many
result - 10000 x 3+5
right_join(card_data, transaction_data, by = c("card_number" = "credit_card_id"), keep = FALSE)
card_data
transaction_data
right_join(transaction_data, card_data, by = c("credit_card_id" = "card_number"), keep = FALSE)
Read in data sets
hats <- read_csv("data/hat_observations.csv")
## Rows: 100 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): observation
## dbl (4): day, month, year, observation_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
hat_separated <- hats %>%
separate(observation,
into = c("hat_colour", "hat_type"),
sep = ",")
hat_separated
hat_dates <- hat_separated %>%
unite(date,
c(day, month, year),
sep = "/") %>%
filter(hat_type == "beret") %>%
slice_max(observation_count)
hat_dates
cand_number = cust_id, card_number = credit_card_id, transaction_id = transaction_id
left_join(card_data, customer_data, by = c("cust_id" = "cust_id")) %>%
left_join(transaction_data, by = c("card_number" = "credit_card_id")) %>%
right_join(fraud_data, by = c("transaction_id" = "transaction_id"))
exam_score <- read_csv("data/exam_scores.csv")
## Rows: 200 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (11): id, exam_Q1, exam_Q2, exam_Q3, exam_Q4, exam_Q5, exam_Q6, exam_Q7,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Read in exam_scores and transform it into long format with two new columns exam_question and score. Then, using separate and select, remove superfluous information from the values in exam_question
# Don't need separate and select?
exam_score %>%
pivot_longer(cols = starts_with("exam"),
names_to = "exam_question",
names_prefix = "exam_Q",
values_to = "score")
# Completing as per the question
exam_score %>%
pivot_longer(cols = starts_with("exam"),
names_to = "exam_question",
values_to = "score") %>%
separate(exam_question, into = c("temp", "exam_question"),
sep = "Q") %>%
select(-temp)